Here I train the 16x video classifiers with the merged data, i.e. using stock culture data from every time point and light setting recorded.
library(tidyverse)
library(randomForest)
library(readr)
library(viridis)
library(e1071)
library(caret)
library(patchwork)
library(parallel)
library(doParallel)
Note: there are only few individuals of the small ciliates because only cells bigger than 1000um2 are tracked at this magnification and most small ciliates (Tetrahymena, Dexiostoma and Loxocephalus) are smaller than that.
Note: there are only few individuals of the small ciliates because only cells bigger than 1000um2 are tracked at this magnification and most small ciliates (Tetrahymena, Dexiostoma and Loxocephalus) are smaller than that.
load("../../Class_18C_normalLight/1st_class_2020/data/16x/Coleps_irchel/Morph_mvt.RData")
dd16 <- morph_mvt
load("../../Class_18C_normalLight/1st_class_2020/data/16x/Colpidium/Morph_mvt.RData")
morph_mvt$species <- "Colpidium"
dd16 <- rbind(dd16, morph_mvt)
load("../../Class_18C_normalLight/1st_class_2020/data/16x/Euplotes/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)
load("../../Class_18C_normalLight/1st_class_2020/data/16x/Euplotes_second/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)
load("../../Class_18C_normalLight/1st_class_2020/data/16x/Paramecium_bursaria/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)
load("../../Class_18C_normalLight/1st_class_2020/data/16x/Paramecium_caudatum/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)
load("../../Class_18C_normalLight/1st_class_2020/data/16x/Stylonychia_1/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia1"
dd16 <- rbind(dd16, morph_mvt)
load("../../Class_18C_normalLight/1st_class_2020/data/16x/Stylonychia_1_second/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia1"
dd16 <- rbind(dd16, morph_mvt)
load("../../Class_18C_normalLight/1st_class_2020/data/16x/Stylonychia_2/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia2"
dd16 <- rbind(dd16, morph_mvt)
load("../../Class_18C_normalLight/1st_class_2020/data/16x/Dexiostoma/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)
load("../../Class_18C_normalLight/1st_class_2020/data/16x/Loxocephallus/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)
load("../../Class_18C_normalLight/1st_class_2020/data/16x/Tetrahymena/Morph_mvt.RData")
morph_mvt$species <- "Tetrahymena"
dd16 <- rbind(dd16, morph_mvt)
# filtering
dd16 <- dd16 %>%
filter(net_disp>50, net_speed>5, N_frames>10)
dd16$magnification <- 1.6
# table(dd16$species)
load("../../Class_18C_normalLight/2nd_class_2022Feb/data/16x/Coleps_irchel/Morph_mvt.RData")
dd16_2022feb <- morph_mvt
load("../../Class_18C_normalLight/2nd_class_2022Feb/data/16x/Colpidium/Morph_mvt.RData")
morph_mvt$species <- "Colpidium"
dd16_2022feb <- rbind(dd16_2022feb, morph_mvt)
load("../../Class_18C_normalLight/2nd_class_2022Feb/data/16x/Dexiostoma/Morph_mvt.RData")
dd16_2022feb <- rbind(dd16_2022feb, morph_mvt)
load("../../Class_18C_normalLight/2nd_class_2022Feb/data/16x/Euplotes/Morph_mvt.RData")
dd16_2022feb <- rbind(dd16_2022feb, morph_mvt)
load("../../Class_18C_normalLight/2nd_class_2022Feb/data/16x/Loxochephalus//Morph_mvt.RData")
dd16_2022feb <- rbind(dd16_2022feb, morph_mvt)
load("../../Class_18C_normalLight/2nd_class_2022Feb/data/16x/Paramecium_bursaria/Morph_mvt.RData")
# morph_mvt %>%
# ggplot(aes(mean_area))+
# geom_histogram()+
# geom_vline(xintercept = 3000)
morph_mvt <- morph_mvt %>% dplyr::filter(mean_area>3000)
dd16_2022feb <- rbind(dd16_2022feb, morph_mvt)
load("../../Class_18C_normalLight/2nd_class_2022Feb/data/16x/Paramecium_caudatum/Morph_mvt.RData")
dd16_2022feb <- rbind(dd16_2022feb, morph_mvt)
load("../../Class_18C_normalLight/2nd_class_2022Feb/data/16x/Stylonychia2_batch1/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia2_batch1"
dd16_2022feb <- rbind(dd16_2022feb, morph_mvt)
load("../../Class_18C_normalLight/2nd_class_2022Feb/data/16x/Stylonychia2_batch2/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia2_batch2"
dd16_2022feb <- rbind(dd16_2022feb, morph_mvt)
load("../../Class_18C_normalLight/2nd_class_2022Feb/data/16x/Tetrahymena/Morph_mvt.RData")
dd16_2022feb <- rbind(dd16_2022feb, morph_mvt)
# filtering
dd16_2022feb <- dd16_2022feb %>%
filter(net_disp>50, net_speed>5, N_frames>10)
dd16_2022feb$magnification <- 1.6
# table(dd16_2022feb$species)
load("../../Class_18C_normalLight/3rd_data_20220301/data/16x/large_ciliates/Morph_mvt.RData")
dd16_20220301 <- morph_mvt
load("../../Class_18C_normalLight/3rd_data_20220301/data/16x/small_ciliates/Morph_mvt.RData")
dd16_20220301 <- rbind(dd16_20220301, morph_mvt)
# filtering
dd16_20220301 <- dd16_20220301 %>%
filter(net_disp>50, net_speed>5, N_frames>10)
dd16_20220301$magnification <- 1.6
# dd16_20220301 %>%
# ggplot(aes(mean_area))+
# geom_histogram()+
# facet_wrap(~species, scales = "free")
dd16_20220301 <- dd16_20220301 %>%
dplyr::filter(!(species %in% c("Paramecium_caudatum","Paramecium_bursaria","Euplotes") & mean_area<2000),
!(species %in% c("Dexiostoma","Tetrahymena","Loxocephallus") & mean_area>1500))
# table(dd16_20220301$species)
load("../../Class_18C_normalLight/4th_data_20220316/data/16x/large_ciliates/Morph_mvt.RData")
dd16_20220316 <- morph_mvt
load("../../Class_18C_normalLight/4th_data_20220316/data/16x/small_ciliates/Morph_mvt.RData")
dd16_20220316 <- rbind(dd16_20220316, morph_mvt)
# filtering
dd16_20220316 <- dd16_20220316 %>%
filter(net_disp>50, net_speed>5, N_frames>10)
dd16_20220316$magnification <- 1.6
# dd16_20220316 %>%
# ggplot(aes(mean_area))+
# geom_histogram()+
# facet_wrap(~species, scales = "free")
dd16_20220316 <- dd16_20220316 %>%
dplyr::filter(!(species %in% c("Paramecium_caudatum","Paramecium_bursaria","Euplotes") & mean_area<2000),
!(species %in% c("Dexiostoma","Tetrahymena","Loxocephallus") & mean_area>1400))
# table(dd16_20220316$species)
load("../../Class_18C_normalLight/5th_data_20220406/data/16x/large_ciliates/Morph_mvt.RData")
dd16_20220406 <- morph_mvt
load("../../Class_18C_normalLight/5th_data_20220406/data/16x/small_ciliates/Morph_mvt.RData")
dd16_20220406 <- rbind(dd16_20220406, morph_mvt)
# filtering
dd16_20220406 <- dd16_20220406 %>%
filter(net_disp>50, net_speed>5, N_frames>10)
dd16_20220406$magnification <- 1.6
# dd16_20220406 %>%
# ggplot(aes(mean_area))+
# geom_histogram()+
# facet_wrap(~species, scales = "free")
dd16_20220406 <- dd16_20220406 %>%
dplyr::filter(!(species %in% c("Paramecium_caudatum","Paramecium_bursaria","Euplotes") & mean_area<2000),
!(species %in% c("Dexiostoma","Tetrahymena","Loxocephallus") & mean_area>1400))
# table(dd16_20220406$species)
load("../../Class_18C_normalLight/6th_data_20220706/data/16x/large_ciliates/Morph_mvt.RData")
dd16_20220706 <- morph_mvt
load("../../Class_18C_normalLight/6th_data_20220706/data/16x/small_ciliates/Morph_mvt.RData")
dd16_20220706 <- rbind(dd16_20220706, morph_mvt)
# filtering
dd16_20220706 <- dd16_20220706 %>%
filter(net_disp>50, net_speed>5, N_frames>10)
dd16_20220706$magnification <- 1.6
# dd16_20220706 %>%
# ggplot(aes(mean_area))+
# geom_histogram()+
# facet_wrap(~species, scales = "free")
dd16_20220706 <- dd16_20220706 %>%
dplyr::filter(!(species %in% c("Paramecium_bursaria","Euplotes") & mean_area<2000),
!(species %in% c("Dexiostoma","Tetrahymena","Loxocephallus") & mean_area>1400))
# table(dd16_20220706$species)
load("../../Class_18C_decreasingLight/Light_18perc/data/16x/Euplotes/Morph_mvt.RData")
dd16_2022feb_decrease <- morph_mvt
load("../../Class_18C_decreasingLight/Light_18perc/data/16x/Paramecium_bursaria/Morph_mvt.RData")
a <- morph_mvt %>%
ggplot(aes(mean_area))+
geom_histogram()
Pburs_dark_16x <- morph_mvt %>%
dplyr::select(mean_area,sd_area,mean_perimeter,sd_perimeter,mean_major,
sd_major,mean_ar,sd_ar,
mean_turning,sd_turning,gross_disp,max_net,net_disp,net_speed,max_step,
min_step,sd_step,sd_gross_speed,max_gross_speed,min_gross_speed)
set.seed(23)
Pburs_dark_16x_clust <- kmeans(Pburs_dark_16x, centers = 2, nstart = 25, iter.max = 1000)
Pburs_dark_16x$cluster <- as.factor(Pburs_dark_16x_clust$cluster)
b <- Pburs_dark_16x %>%
ggplot(aes(mean_area, col=cluster, fill=cluster))+
geom_histogram(position = "identity", alpha=0.3)
Pburs_dark_16x_clust <- kmeans(Pburs_dark_16x, centers = 3, nstart = 25, iter.max = 1000)
Pburs_dark_16x$cluster <- as.factor(Pburs_dark_16x_clust$cluster)
c <- Pburs_dark_16x %>%
ggplot(aes(mean_area, col=cluster, fill=cluster))+
geom_histogram(position = "identity", alpha=0.3)
Pburs_dark_16x_clust <- kmeans(Pburs_dark_16x, centers = 4, nstart = 25, iter.max = 1000)
Pburs_dark_16x$cluster <- as.factor(Pburs_dark_16x_clust$cluster)
d <- Pburs_dark_16x %>%
ggplot(aes(mean_area, col=cluster, fill=cluster))+
geom_histogram(position = "identity", alpha=0.3)
morph_mvt <- morph_mvt %>%
dplyr::mutate(cluster=as.factor(Pburs_dark_16x_clust$cluster)) %>%
dplyr::filter(cluster !="2") %>%
dplyr::select(-cluster)
e <- morph_mvt %>%
ggplot(aes(mean_area))+
geom_histogram()
# a + b + c + d + e + plot_annotation(title = "Cleaning of P.burs dark 16x")
dd16_2022feb_decrease <- rbind(dd16_2022feb_decrease, morph_mvt)
# filtering
dd16_2022feb_decrease <- dd16_2022feb_decrease %>%
filter(net_disp>50, net_speed>5, N_frames>10)
dd16_2022feb_decrease$magnification <- 1.6
# table(dd16_2022feb_decrease$species)
load("../../Class_18C_decreasingLight/Light_10perc/data/16x/large_ciliates/Morph_mvt.RData")
dd16_20220301_decrease <- morph_mvt
load("../../Class_18C_decreasingLight/Light_10perc/data/16x/small_ciliates/Morph_mvt.RData")
dd16_20220301_decrease <- rbind(dd16_20220301_decrease, morph_mvt)
# filtering
dd16_20220301_decrease <- dd16_20220301_decrease %>%
filter(net_disp>50, net_speed>5, N_frames>10)
dd16_20220301_decrease$magnification <- 1.6
# dd16_20220301_decrease %>%
# ggplot(aes(mean_area))+
# geom_histogram()+
# facet_wrap(~species, scales = "free")
dd16_20220301_decrease <- dd16_20220301_decrease %>%
dplyr::filter(!(species %in% c("Paramecium_bursaria","Euplotes","Paramecium_caudatum") & mean_area<2500),
!(species %in% c("Coleps_irchel") & mean_area<1000),
!(species %in% c("Dexiostoma","Tetrahymena","Loxocephallus") & mean_area>1500))
# table(dd16_20220301_decrease$species)
load("../../Class_18C_decreasingLight/Light_6perc/data/16x/large_ciliates/Morph_mvt.RData")
dd16_20220316_decrease <- morph_mvt
load("../../Class_18C_decreasingLight/Light_6perc/data/16x/small_ciliates/Morph_mvt.RData")
dd16_20220316_decrease <- rbind(dd16_20220316_decrease, morph_mvt)
# filtering
dd16_20220316_decrease <- dd16_20220316_decrease %>%
filter(net_disp>50, net_speed>5, N_frames>10)
dd16_20220316_decrease$magnification <- 1.6
# dd16_20220316_decrease %>%
# ggplot(aes(mean_area))+
# geom_histogram()+
# facet_wrap(~species, scales = "free")
dd16_20220316_decrease <- dd16_20220316_decrease %>%
dplyr::filter(!(species %in% c("Paramecium_bursaria","Euplotes","Paramecium_caudatum") & mean_area<2000),
!(species %in% c("Coleps_irchel") & mean_area<1000),
!(species %in% c("Dexiostoma","Tetrahymena","Loxocephallus") & mean_area>1400))
# table(dd16_20220316_decrease$species)
load("../../Class_18C_decreasingLight/Light_1perc/data/16x/large_ciliates/Morph_mvt.RData")
dd16_20220406_decrease <- morph_mvt
load("../../Class_18C_decreasingLight/Light_1perc/data/16x/small_ciliates/Morph_mvt.RData")
dd16_20220406_decrease <- rbind(dd16_20220406_decrease, morph_mvt)
# filtering
dd16_20220406_decrease <- dd16_20220406_decrease %>%
filter(net_disp>50, net_speed>5, N_frames>10)
dd16_20220406_decrease$magnification <- 1.6
# dd16_20220406_decrease %>%
# ggplot(aes(mean_area))+
# geom_histogram()+
# facet_wrap(~species, scales = "free")
dd16_20220406_decrease <- dd16_20220406_decrease %>%
dplyr::filter(!(species %in% c("Paramecium_bursaria","Euplotes","Paramecium_caudatum") & mean_area<2000),
!(species %in% c("Coleps_irchel") & mean_area<1000),
!(species %in% c("Dexiostoma","Tetrahymena","Loxocephallus") & mean_area>1400))
# table(dd16_20220406_decrease$species)
load("../../Class_18C_decreasingLight/Light_1perc/data_20220706/data/16x/large_ciliates/Morph_mvt.RData")
dd16_20220706_decrease <- morph_mvt
load("../../Class_18C_decreasingLight/Light_1perc/data_20220706/data/16x/small_ciliates/Morph_mvt.RData")
dd16_20220706_decrease <- rbind(dd16_20220706_decrease, morph_mvt)
# filtering
dd16_20220706_decrease <- dd16_20220706_decrease %>%
filter(net_disp>50, net_speed>5, N_frames>10)
dd16_20220706_decrease$magnification <- 1.6
# dd16_20220706_decrease %>%
# ggplot(aes(mean_area))+
# geom_histogram()+
# facet_wrap(~species, scales = "free")
dd16_20220706_decrease <- dd16_20220706_decrease %>%
dplyr::filter(!(species %in% c("Paramecium_bursaria","Euplotes","Paramecium_caudatum") & mean_area<2000),
!(species %in% c("Coleps_irchel") & mean_area<1000),
!(species %in% c("Dexiostoma","Tetrahymena","Loxocephallus") & mean_area>1400))
# table(dd16_20220706_decrease$species)
dd16$species <- ifelse(dd16$species %in% c("Tetrahymena","Dexiostoma","Loxocephallus"),
"Smaller_ciliates",dd16$species)
dd16_2022feb$species <- ifelse(dd16_2022feb$species %in% c("Tetrahymena","Dexiostoma","Loxocephallus"),
"Smaller_ciliates",dd16_2022feb$species)
dd16_20220301$species <- ifelse(dd16_20220301$species %in% c("Tetrahymena","Dexiostoma","Loxocephallus"),
"Smaller_ciliates",dd16_20220301$species)
dd16_20220301_decrease$species <- ifelse(dd16_20220301_decrease$species %in% c("Tetrahymena","Dexiostoma","Loxocephallus"),
"Smaller_ciliates",dd16_20220301_decrease$species)
dd16_20220316$species <- ifelse(dd16_20220316$species %in% c("Tetrahymena","Dexiostoma","Loxocephallus"),
"Smaller_ciliates",dd16_20220316$species)
dd16_20220316_decrease$species <- ifelse(dd16_20220316_decrease$species %in% c("Tetrahymena","Dexiostoma","Loxocephallus"),
"Smaller_ciliates",dd16_20220316_decrease$species)
dd16_20220406$species <- ifelse(dd16_20220406$species %in% c("Tetrahymena","Dexiostoma","Loxocephallus"),
"Smaller_ciliates",dd16_20220406$species)
dd16_20220406_decrease$species <- ifelse(dd16_20220406_decrease$species %in% c("Tetrahymena","Dexiostoma","Loxocephallus"),
"Smaller_ciliates",dd16_20220406_decrease$species)
dd16_20220706$species <- ifelse(dd16_20220706$species %in% c("Tetrahymena","Dexiostoma","Loxocephallus"),
"Smaller_ciliates",dd16_20220706$species)
dd16_20220706_decrease$species <- ifelse(dd16_20220706_decrease$species %in% c("Tetrahymena","Dexiostoma","Loxocephallus"),
"Smaller_ciliates",dd16_20220706_decrease$species)
dd16$data <- "Late 2020"
dd16_2022feb$data <- "20220201"
dd16_2022feb_decrease$data <- "20220201"
dd16_20220301$data <- "20220301"
dd16_20220301_decrease$data <- "20220301"
dd16_20220316$data <- "20220316"
dd16_20220316_decrease$data <- "20220316"
dd16_20220406$data <- "20220406"
dd16_20220406_decrease$data <- "20220406"
dd16_20220706$data <- "20220706"
dd16_20220706_decrease$data <- "20220706"
dd_30perc <- rbind(dd16,dd16_2022feb,
dd16_20220301 %>% dplyr::select(-Light_dark),
dd16_20220316 %>% dplyr::select(-Light_dark),
dd16_20220406 %>% dplyr::select(-Light_dark),
dd16_20220706 %>% dplyr::select(-Light_dark))
dd_18perc <- dd16_2022feb_decrease
dd_10perc <- dd16_20220301_decrease %>%
dplyr::select(-Light_dark)
dd_6perc <- dd16_20220316_decrease %>%
dplyr::select(-Light_dark)
dd_1perc <- rbind(dd16_20220406_decrease %>% dplyr::select(-Light_dark),
dd16_20220706_decrease %>% dplyr::select(-Light_dark))
dd_30perc$light <- "30%"
dd_18perc$light <- "18%"
dd_10perc$light <- "10%"
dd_6perc$light <- "6%"
dd_1perc$light <- "1%"
dd <- rbind(dd_30perc,dd_18perc,dd_10perc,dd_6perc,dd_1perc)
dd$species <- ifelse(dd$species == "Stylonychia_2", "Stylonychia2", dd$species)
dd$species <- ifelse(dd$species %in% c("Stylonychia2_batch1","Stylonychia2_batch2"), "Stylonychia2", dd$species)
# table(dd$species, dd$data, dd$magnification, dd$light)
If an individual is an outlier in more than 3 traits it gets excluded (about 7% gets excluded). Outlier based on boxplot definition.
dd$id <- 1:nrow(dd)
dd <- dd %>% na.omit()
dd_long <- dd %>%
dplyr::select(species, mean_area,sd_area,mean_perimeter,sd_perimeter,mean_major,
sd_major,mean_ar,sd_ar,
mean_turning,sd_turning,gross_disp,max_net,net_disp,net_speed,max_step,
min_step,sd_step,sd_gross_speed,max_gross_speed,min_gross_speed, id,
data, light, magnification) %>%
pivot_longer(cols=-c(id,species,data,light,magnification), names_to="variable") %>%
dplyr::group_by(variable,species,data,light,magnification) %>%
mutate(iqr = IQR(value),
first_quartile = quantile(value, probs = 0.25),
third_quartile = quantile(value, probs = 0.75),
outlier = ifelse(value<first_quartile-1.5*iqr | value>third_quartile+1.5*iqr,T,F))
outliers <- dd_long %>%
dplyr::filter(outlier==T) %>%
dplyr::group_by(species, id, data, light, magnification) %>%
dplyr::summarise(n = n()) %>%
dplyr::filter(n>3)
# table(outliers$species)
# nrow(outliers)/nrow(dd)
dd_filtered <- dd %>%
dplyr::filter(!is.element(id,outliers$id))
dd$removed_outliers <- F
dd_filtered$removed_outliers <- T
dd_comparison <- rbind(dd,dd_filtered)
dd <- dd_filtered
# dd_comparison %>%
# dplyr::filter(magnification==1.6) %>%
# ggplot(aes((mean_area), col=removed_outliers))+
# geom_histogram(aes(fill=removed_outliers, y=..density..), position = "identity", alpha=0.3) +
# # geom_boxplot(outlier.alpha = 0.3) +
# theme_bw() +
# facet_wrap(species~interaction(light,data), scales = "free")
#
# dd_filtered %>%
# ggplot(aes(log10(mean_area)))+
# geom_density(aes(col=species))
# table(dd$species, dd$data, dd$magnification, dd$light)
We have data from several time points and light settings. The goals is that in the final training data these are somewhat equally represented, regardless that the number of individuals tracked per species and time point can vary a lot (in other words: “even it out” across species, time point and light setting). This is important, as otherwise it can be that groups that are more abundant are weighted more.
dd <- dd %>%
mutate(data.light = interaction(data,light, drop = T),
data.light.species=interaction(data,light,species, drop = T))
print("number of individuals per species per date and light setting")
## [1] "number of individuals per species per date and light setting"
tab <- table(dd$data.light, dd$species)
tab
##
## Coleps_irchel Colpidium Euplotes Paramecium_bursaria
## 20220406.1% 43 189 69 332
## 20220706.1% 9 392 175 174
## 20220301.10% 1421 462 68 140
## 20220201.18% 0 0 1404 1464
## 20220201.30% 2038 559 476 960
## 20220301.30% 1072 402 106 141
## 20220316.30% 207 436 286 1461
## 20220406.30% 133 199 236 763
## 20220706.30% 432 382 971 2070
## Late 2020.30% 1120 296 1879 2307
## 20220316.6% 329 468 697 1375
##
## Paramecium_caudatum Smaller_ciliates Stylonychia1 Stylonychia2
## 20220406.1% 199 2 0 237
## 20220706.1% 148 20 0 33
## 20220301.10% 744 352 0 922
## 20220201.18% 0 0 0 0
## 20220201.30% 1134 75 0 1149
## 20220301.30% 1308 494 0 2071
## 20220316.30% 1563 21 0 1243
## 20220406.30% 566 19 0 1017
## 20220706.30% 935 9 0 2944
## Late 2020.30% 2038 30 561 347
## 20220316.6% 1371 33 0 925
print("Sum of individuals per species")
## [1] "Sum of individuals per species"
colsums <- colSums(tab)
colsums
## Coleps_irchel Colpidium Euplotes Paramecium_bursaria
## 6804 3785 6367 11187
## Paramecium_caudatum Smaller_ciliates Stylonychia1 Stylonychia2
## 10006 1055 561 10888
Of course, Stylo1 is the one we the least individuals, because it went extinct and we do not have it anymore. As it went extinct, it shouldn’t be the limiting factor, I think. For now at least I’m taking 2 * Stylo1_sum as the number of individuals per species to be included (if a species/class has less than that all individuals are included).
Within a class I use that number so that roughly equally many individuals across time steps and light settings are included. As a last thing I divide the data into a training dataset and in a test dataset.
n <- min(colsums)*3
print(paste0("Max number of individuals used per species (if there are fewer for a species, then all are used): n=",n))
## [1] "Max number of individuals used per species (if there are fewer for a species, then all are used): n=1683"
num_ind_per_class <- dd %>% group_by(species) %>%
summarize(num_class = length(unique(data.light.species)),
num_ind_per_class = floor(n/num_class)) %>%
select(species,num_ind_per_class)
num_ind_per_class_vec <- c(num_ind_per_class$num_ind_per_class)
names(num_ind_per_class_vec) <- num_ind_per_class$species
set.seed(53)
split_up <- split(dd, f = dd$data.light.species)
split_up <- lapply(split_up, function(x) {
species <- unique(x$species)
x %>% sample_n(ifelse(nrow(x) < num_ind_per_class_vec[species], nrow(x), num_ind_per_class_vec[species]))})
trainingdata <- do.call("rbind", split_up)
print("Subsampled: number of individuals per species per date and light setting")
## [1] "Subsampled: number of individuals per species per date and light setting"
tab2 <- table(trainingdata$data.light, trainingdata$species)
tab2
##
## Coleps_irchel Colpidium Euplotes Paramecium_bursaria
## 20220406.1% 43 168 69 153
## 20220706.1% 9 168 153 153
## 20220301.10% 168 168 68 140
## 20220201.18% 0 0 153 153
## 20220201.30% 168 168 153 153
## 20220301.30% 168 168 106 141
## 20220316.30% 168 168 153 153
## 20220406.30% 133 168 153 153
## 20220706.30% 168 168 153 153
## Late 2020.30% 168 168 153 153
## 20220316.6% 168 168 153 153
##
## Paramecium_caudatum Smaller_ciliates Stylonychia1 Stylonychia2
## 20220406.1% 168 2 0 168
## 20220706.1% 148 20 0 33
## 20220301.10% 168 168 0 168
## 20220201.18% 0 0 0 0
## 20220201.30% 168 75 0 168
## 20220301.30% 168 168 0 168
## 20220316.30% 168 21 0 168
## 20220406.30% 168 19 0 168
## 20220706.30% 168 9 0 168
## Late 2020.30% 168 30 561 168
## 20220316.6% 168 33 0 168
print("Subsampled: Sum of individuals per species")
## [1] "Subsampled: Sum of individuals per species"
colsums2 <- colSums(tab2)
colsums2
## Coleps_irchel Colpidium Euplotes Paramecium_bursaria
## 1361 1680 1467 1658
## Paramecium_caudatum Smaller_ciliates Stylonychia1 Stylonychia2
## 1660 545 561 1545
trainingdata <- trainingdata[complete.cases(trainingdata),]
# A stratified random split of the data
idx_train <- createDataPartition(trainingdata$species,
p = 0.65, # percentage of data as training
list = FALSE)
testdata <- trainingdata[-idx_train,]
trainingdata <- trainingdata[idx_train,]
species <- unique(dd$species)
compositions <- read_csv("../../compositions.csv")
## Rows: 15 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): composition
## dbl (19): richness, Chlamydomonas, Cryptomonas, Monoraphidium, Cosmarium, St...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
comp_name <- compositions$composition
compositions$Smaller_ciliates <- with(compositions,
ifelse(Tetrahymena == 1| Dexiostoma == 1| Loxocephallus == 1, 1, 0))
compositions <- compositions[,species]
compositions_list <- apply(compositions, 1, function(x){
idx <- which(x==1)
names(idx)
})
names(compositions_list) <- comp_name
## filterting out Stylo1
compositions_list <- lapply(compositions_list, function(x){
x[x!="Stylonychia1"]
})
classifier_plot_svm <- function(table, combination.nr){
# cm <- classifier$confusion
cm <- table
ncol <- ncol(cm)
cm <- apply(cm, 1, function(x) x/sum(x))
cm_long <- data.frame(Predicted = factor(rep(rownames(cm),ncol), levels = rownames(cm)),
Truth = factor(rep(colnames(cm), each=ncol), levels = colnames(cm)),
Classification = as.vector(cm))
plot <- cm_long %>%
ggplot(aes(Predicted,Truth,fill=Classification))+
geom_tile() +
geom_text(aes(label = round(Classification, 3)), col="white") +
scale_x_discrete(position = "top") +
scale_y_discrete(limits = rev(levels(cm_long$Truth))) +
scale_fill_viridis(option = "D", end=.8, discrete=FALSE)+
theme(axis.text.x = element_text(angle = 90, hjust = 0))+
theme(legend.text = element_text(size=14),
axis.title = element_text(size=14),
title = element_text(size=16),
axis.text = element_text(size=13))+
labs(title=paste("PPV of",combination.nr),fill="PPV")
return(plot)
}
classifier_assessment_plot <- function(cf, combination.nr){
cf.df <- cf$byClass[,1:4] %>%
as.data.frame() %>%
mutate(Species = gsub("Class: ", "", rownames(cf$byClass[,1:5]))) %>%
rename("NPV" = "Neg Pred Value", "PPV" = "Pos Pred Value",
"Sens" = "Sensitivity", "Spec" = "Specificity") %>%
pivot_longer(cols = 1:4) %>%
mutate(col = ifelse(value>=0.9,"1",
ifelse(value<0.9 & value>=0.8,"2",
ifelse(value<0.8 & value>=0.7,"3","4"))))
plot <- cf.df %>%
ggplot(aes(name,Species,fill=col))+
geom_tile() +
geom_text(aes(label = round(value, 3)), col="white") +
scale_x_discrete(position = "top") +
scale_y_discrete(limits = rev(levels(as.factor(cf.df$Species)))) +
scale_fill_manual(values = c("#006837","#66bd63","#f46d43","#a50026"), breaks = c("1","2","3","4")) +
theme(legend.text = element_text(size=14),
title = element_text(size = 16),
axis.title = element_blank(),
axis.text = element_text(size=13),
legend.position = "none")+
labs(title=combination.nr, fill="")
return(plot)
}
formula <- factor(species) ~ mean_area + sd_area + mean_perimeter + sd_perimeter + mean_major +
sd_major + mean_ar + sd_ar +
mean_turning + sd_turning + gross_disp + max_net + net_disp + net_speed + max_step +
min_step + sd_step + sd_gross_speed + max_gross_speed + min_gross_speed
set.seed(7456)
# classifiers_18c_16x <- list()
# classifiers_18c_16x_plots <- list()
# classifiers_18c_16x_assess_plots <- list()
completeList <- mclapply(1:length(compositions_list), function(i){
sub_trainingdata <- trainingdata %>%
filter(species %in% c(compositions_list[[i]]))
sub_testdata <- testdata %>%
filter(species %in% c(compositions_list[[i]]))
sub_trainingdata$species <- factor(sub_trainingdata$species)
sub_testdata$species <- factor(sub_testdata$species)
# split_up <- split(sub_trainingdata, f = sub_trainingdata$species)
# subsamples <- lapply(split_up, function(x) {
# x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
# sub_trainingdata <- do.call("rbind", subsamples)
# A stratified random split of the data
# idx_train <- createDataPartition(sub_trainingdata$species,
# p = 0.7, # percentage of data as training
# list = FALSE)
#
# sub_testdata <- sub_trainingdata[-idx_train,]
# sub_trainingdata <- sub_trainingdata[idx_train,]
obj <- tune(svm, formula, data = sub_trainingdata, kernel="radial",
ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
tunecontrol = tune.control(sampling = "fix", best.model = T))
classifiers_18c_16x <- obj$best.model
cf <- caret::confusionMatrix(predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
sub_testdata$species)
sub_testdata <- sub_testdata %>%
dplyr::mutate(predicted = predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
correct = species==predicted)
sub_testdata_summ <- sub_testdata %>%
group_by(data, species, light, magnification, correct) %>%
summarize(n = n()) %>%
dplyr::mutate(perc_corr = n/sum(n),
composition = names(compositions_list)[i]) %>%
dplyr::filter(correct==T)
classified_test_data <- sub_testdata_summ
classifiers_18c_16x_plots <- classifier_plot_svm(table = cf$table,
combination.nr = names(compositions_list)[[i]])
if(length(unique(sub_testdata$species))==2){
cf2 <- caret::confusionMatrix(predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
sub_testdata$species, positive=as.character(unique(sub_testdata$species)[2]))
cf3 <- rbind(cf$byClass,cf2$byClass)
rownames(cf3) <- unique(sub_testdata$species)
cf$byClass <- cf3
}
classifiers_18c_16x_assess_plots <- classifier_assessment_plot(cf, names(compositions_list)[[i]])
list(classifiers_18c_16x, classified_test_data, classifiers_18c_16x_plots, classifiers_18c_16x_assess_plots)
}, mc.cores = detectCores()-3)
classifiers_18c_16x <- map(completeList, 1)
classified_test_data <- map(completeList, 2)
classifiers_18c_16x_plots <- map(completeList, 3)
classifiers_18c_16x_assess_plots <- map(completeList, 4)
classified_test_data <- do.call("rbind", classified_test_data)
names(classifiers_18c_16x_plots) <- names(classifiers_18c_16x) <-
names(classifiers_18c_16x_assess_plots) <- comp_name
There are mainly 4 measures:
Sensitivity: the probability that an individuals is classified as species X given that the it is of species X: P(Classified as species X | is of species X)
Specificity: the probability that an individuals is not classified as species X given that it is not of species X: P(Not classified as species X | is not of species X)
Positive Predictive Value PPV: the probability that an individual is of species X given that it has been classified as species X: P(is of species X | is classified as species X)
Negative Predictive Value NPV: the probability that an individuals is not of species X given that it has not been classified as species X: P(is not of species X | has not been classified as species X)
PPV is the most important for us!
library(patchwork)
for(i in 1:length(classifiers_18c_16x_plots)){
print(classifiers_18c_16x_plots[[i]] + classifiers_18c_16x_assess_plots[[i]] +
plot_layout(widths = c(4,2)))
}
classified_test_data <- classified_test_data%>%
dplyr::mutate(data=ifelse(data=="Late 2020","2020late",data),
tot_n = n/perc_corr, data=factor(data, ordered = T,
levels=c("2020late","20220201","20220301","20220316","20220406","20220706")),
light_num = as.numeric(gsub("%","",light)))
# classified_test_data %>%
# dplyr::filter(!(species %in% c("Cryptomonas","Debris_and_other"))) %>%
# ggplot(aes(data, perc_corr, col=composition, shape=light, size=tot_n)) +
# geom_hline(yintercept = 0.8, col="red") +
# geom_hline(yintercept = 0.9, col="green")+
# geom_point(position = position_jitter(0.2))+
# facet_wrap(~species, nrow = 3)+
# theme_bw() +
# scale_size_continuous(breaks = c(1,10,20,30)) +
# labs(size="Sample size of evaluation data", y="Correct identification in percentage", x="Date of data collection",
# title="Species identification across date, light intensities and compositions")
classified_test_data %>%
dplyr::filter(!(species %in% c("Cryptomonas","Debris_and_other"))) %>%
ggplot(aes(light_num, perc_corr))+
geom_hline(yintercept = 0.8, col="red") +
geom_point(aes(fill=composition,size=tot_n),shape=21,col="black",alpha=0.3)+
facet_wrap(~species, nrow = 3)+
scale_size_continuous(breaks = c(1,10,20,30)) +
geom_smooth(method = "lm")+
theme_bw() +
labs(size="Sample size of evaluation data", y="Correct species identification in percentage", x="Light intensity in percentage",
title="Species identification across light intensities and compositions")
## `geom_smooth()` using formula 'y ~ x'
# classified_test_data %>%
# dplyr::filter(!(species %in% c("Cryptomonas","Debris_and_other"))) %>%
# ggplot(aes(tot_n,perc_corr, col=species))+
# geom_point()+
# theme_bw()
saveRDS(classifiers_18c_16x, "svm_video_classifiers_18c_16x_20220706_MergedData.rds")
# cl <- readRDS("classifiers_18c_16x.rds")